{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6f5f29ab-3b00-4377-a320-421b6e33386f",
   "metadata": {},
   "source": [
    "#### Objective:- Alternative script that generates points within rectangular bounds in 40min for sanity checks ( Needs to be integrated with cuSpatial for checking points within polygon) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c9f4d54-8bb4-42ce-aa05-a7cb5465472a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf, cupy\n",
    "import pandas as pd, numpy as np\n",
    "import geopandas as gpd\n",
    "# from shapely.geometry import Point, Polygon\n",
    "import os\n",
    "import datetime\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad825f8c-be08-40c6-8b95-fc75f357be7d",
   "metadata": {
    "tags": []
   },
   "source": [
    "### ETL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e9d91de-5c2b-4d5d-8810-f5d74598ca7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('data/mapped_data_full.csv',encoding='unicode_escape',dtype={'GISJOIN':'int64','ID20':'int64','STATE':'int32','COUNTY':'str','P20':'int32','P10_new':'int32'}).drop('Unnamed: 0',axis=1)\n",
    "df['P_delta']=df['P20'] - df['eq_P10']\n",
    "df['P_net']= df['P_delta'].apply(lambda x : 1 if x>0 else 0)\n",
    "df['number'] = df.P_delta.round().abs().astype('int32')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91a55f8f-e5bd-4fd5-9d6e-82cab1a7c1cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df =df.to_pandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b6a458e-61dd-4645-a122-366a40fa9f95",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### MAKE function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "186e35dc-e5e8-4820-b02b-6baea50ca749",
   "metadata": {},
   "outputs": [],
   "source": [
    "def Random_Points_in_Bounds(row):  \n",
    "    polygon = row.iloc[0]\n",
    "    number = row.iloc[1]\n",
    "    minx, miny, maxx, maxy = polygon.bounds\n",
    "    x = np.random.uniform( minx, maxx, number )\n",
    "    y = np.random.uniform( miny, maxy, number )\n",
    "    return [x, y]\n",
    "\n",
    "def makeXYpair(row):\n",
    "    l1 = row[0]\n",
    "    l2 = row[1]\n",
    "    return list(map(lambda x, y:[x,y], l1, l2))\n",
    "\n",
    "\n",
    "def exec_data(state_key_list):\n",
    "    c=0\n",
    "    for i in state_key_list:\n",
    "        c+=1\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        path ='data/tl_shapefiles/tl_2021_%s_tabblock20.shp'%(i_str)\n",
    "        print(\"started reading shape file for state \", states[i])\n",
    "        if os.path.isfile(path):    \n",
    "            gpdf = gpd.read_file(path)[['GEOID20', 'geometry']].sort_values('GEOID20').reset_index(drop=True)\n",
    "            gpdf.GEOID20 = gpdf.GEOID20.astype('int64')\n",
    "            print(\"completed reading shape file for state \", states[i])\n",
    "            df_temp = df.query('STATE == @i')[['ID20', 'number','COUNTY','P_delta','P_net']]\n",
    "            merged_df =pd.merge(gpdf,df_temp[['ID20','number']],left_on='GEOID20',right_on='ID20',how='inner')\n",
    "            merged_df = merged_df[merged_df.number!=0].reset_index(drop=True)\n",
    "            merged_df =merged_df.reset_index(drop=True).drop(columns=['GEOID20'])\n",
    "\n",
    "            print(\"starting to generate data for \"+str(states[i])+\"... \")\n",
    "            t1 = datetime.datetime.now()\n",
    "            population_df = pd.DataFrame(merged_df[['geometry','number']].apply(Random_Points_in_Bounds,axis=1),columns=['population'])\n",
    "            points_df = population_df['population'].apply(makeXYpair)\n",
    "            points_df = pd.DataFrame(points_df.explode()).reset_index()\n",
    "            \n",
    "            pop_list =points_df['population'].to_list()\n",
    "            final_df =pd.DataFrame(pop_list,columns=['x','y']).reset_index(drop=True)\n",
    "            \n",
    "            ids = merged_df.ID20.to_list()\n",
    "            number =merged_df.number.to_list()\n",
    "            \n",
    "            rows = []\n",
    "            for id20, n in zip(ids,number):\n",
    "                rows.extend([id20]*n)\n",
    "            \n",
    "            \n",
    "            final_df['ID20'] = pd.Series(rows)\n",
    "            final_df = final_df.sort_values('ID20').reset_index(drop=True)\n",
    "            final_df = pd.merge(final_df,df_temp, on='ID20',how='left')\n",
    "            \n",
    "            \n",
    "            final_df.to_csv('data/migration_files1/migration_%s'%str(states[i])+'.csv', index=False)\n",
    "            print(\"Processing complete for\", states[i])\n",
    "            print('Processing for '+str(states[i])+' complete \\n total time', datetime.datetime.now() - t1)\n",
    "            \n",
    "            del(df_temp)\n",
    "        else:\n",
    "            print(\"shape file does not exist\")\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5e2b4fb-c8e2-48fc-a496-fb0e0b5387a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "#           16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "#           28:\"MS\",29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "#           40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "#           54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"}\n",
    "states= { 12:\"FL\",13:\"GA\",15:\"HI\",16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68872dfa-8eb7-44a3-9bdf-73376f8c28ec",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "exec_data(states.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42b474dd-8db8-48d1-8a66-62bcc8dbad27",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Concat States"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1f2fd86-6a7d-41db-96e9-2665c90bf4c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_shape_and_states(state_key_list):\n",
    "    concat_states = cudf.DataFrame()\n",
    "    \n",
    "    for i in state_key_list:\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        path = 'data/migration_files1/migration_%s'%str(states[i])+'.csv'\n",
    "        if os.path.isfile(path): \n",
    "            temp = cudf.read_csv(path,dtype={'ID20':'int64','x':'float32','y':'float32'})# Load shape files\n",
    "            concat_states = cudf.concat([concat_states,temp])\n",
    "        else:\n",
    "            print(path)\n",
    "            print(\"shape file does not exist\")\n",
    "            continue\n",
    "    return concat_states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e88d458c-2016-4076-b513-59e1277f751b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "indv_df = merge_shape_and_states(states.keys())\n",
    "indv_df.rename(columns={'GEOID20':'ID20'},inplace=True)\n",
    "indv_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fdc83845-75c0-4ad0-8538-1abeca2190cc",
   "metadata": {},
   "source": [
    "### Load saved files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d37b71c-fad2-46b5-9324-f0e67bf85a09",
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle.dump(indv_df,open('fulldata_gpu_2','wb'))\n",
    "# indv_df = pickle.load(open('fulldata_gpu','rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d94ea6f0-0c6f-4932-87cb-53ad28b3b57a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# indv_df = indv_df.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebae939f-f03d-478d-ad45-c0fee7670f0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "indv_df = dask_cudf.from_cudf(indv_df, npartitions=2).persist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efe847c5-892c-4bee-bacb-38e57ae56ebb",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# dataset = pd.merge(indv_df,df,on='ID20',how='left')\n",
    "dataset = indv_df.merge(df,on='ID20',how='left') # merge dask dfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36e6fb1b-a420-4ba2-8b25-b14f5978ca9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b63b35b-3774-4900-a1c2-1dc6615784eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "del(indv_df)\n",
    "del(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f85b884-4e84-451c-a170-ce25631cc922",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "dataset = dataset.sort_values('ID20')\n",
    "dataset = dataset.drop(columns=['GISJOIN'])\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0a789ed-4c60-43fd-ac76-1ac99a971110",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Viz check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8643ed6-4b19-4733-a2a6-eba71884e700",
   "metadata": {},
   "outputs": [],
   "source": [
    "from holoviews.element.tiles import CartoDark\n",
    "import holoviews as hv\n",
    "from holoviews.operation.datashader import datashade,rasterize,shade\n",
    "from plotly.colors import sequential\n",
    "hv.extension('plotly')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85562104-aef7-48c0-85bd-347316a3f633",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset[\"easting\"], dataset[\"northing\"] = hv.Tiles.lon_lat_to_easting_northing(dataset[\"x\"], dataset[\"y\"])\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8653497-d17e-4c38-b1e8-732d424cae04",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = hv.Dataset(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a19ce9a-9d34-4507-8d60-93dab88dd289",
   "metadata": {},
   "outputs": [],
   "source": [
    "mapbox_token = 'pk.eyJ1IjoibmlzaGFudGoiLCJhIjoiY2w1aXpwMXlkMDEyaDNjczBkZDVjY2l6dyJ9.7oLijsue-xOICmTqNInrBQ'\n",
    "tiles= hv.Tiles().opts(mapboxstyle=\"dark\", accesstoken=mapbox_token)\n",
    "points = datashade(hv.Points(dataset, [\"easting\", \"northing\"]),cmap=sequential.Plasma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d1fc4fe-e329-4f8c-984b-752b8c87246c",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "(tiles*points).opts(width=1800, height=500)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
